In [1]:
import gzip
import json
import pandas as pd
from collections import defaultdict, Counter
In [2]:
%%time
data = []
media_types = defaultdict(int)
url_types = defaultdict(int)
has_urls = 0
unique_urls = set()
with gzip.open("all_ids.txt.json.gz") as fp:
    for line in fp:
        d = json.loads(line.strip())
        data.append(d)
        if 'entities' not in d:
            continue
        # Count media entities (photos, videos, ...) by type
        if 'media' in d['entities']:
            m_entities = d['entities']['media']
            for m in m_entities:
                m_type = m['type']
                media_types[m_type] += 1
        # Count URL entities and tally their domains
        if 'urls' in d['entities']:
            m_entities = d['entities']['urls']
            if len(m_entities) > 0:
                has_urls += 1
            for m in m_entities:
                media_types['url'] += 1
                m = m['expanded_url']
                m_type = m.split("/", 3)[2]  # crude domain extraction from scheme://domain/...
                unique_urls.add((m, m_type))
                url_types[m_type] += 1
print(media_types)
url_types = Counter(url_types)
print("Of {} tweets, {} contain a total of {} urls with {} unique domains and {} unique urls".format(
    len(data), has_urls, media_types["url"], len(url_types), len(unique_urls)))
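A note on the domain extraction above: m.split("/", 3)[2] assumes every expanded URL has the scheme://domain/path shape and keeps any www. prefix, so nytimes.com and www.nytimes.com are counted separately. A more robust sketch using the standard library (Python 3's urllib.parse; the helper name is ours):

from urllib.parse import urlparse

def extract_domain(url):
    # urlparse copes with missing paths, ports, and query strings
    netloc = urlparse(url).netloc
    # fold "www.example.com" into "example.com"
    return netloc[4:] if netloc.startswith("www.") else netloc

extract_domain("https://www.nytimes.com/2016/some-story")  # -> 'nytimes.com'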
In [3]:
url_types.most_common(50)
Out[3]:
In [4]:
sorted(unique_urls,
       key=lambda x: url_types[x[1]],
       reverse=True)[:10]
Out[4]:
In [5]:
len(data)
Out[5]:
In [6]:
data[0].keys()
Out[6]:
In [7]:
data[0][u'source']
Out[7]:
In [8]:
data[0][u'is_quote_status']
Out[8]:
In [9]:
data[0][u'quoted_status']['text']
Out[9]:
In [10]:
data[0]['text']
Out[10]:
In [11]:
count_quoted = 0
has_coordinates = 0
count_replies = 0
language_ids = defaultdict(int)
count_user_locs = 0
user_locs = Counter()
count_verified = 0
for d in data:
    count_quoted += d.get('is_quote_status', 0)
    coords = d.get(u'coordinates', None)
    repl_id = d.get(u'in_reply_to_status_id', None)
    has_coordinates += (coords is not None)
    count_replies += (repl_id is not None)
    # User-level fields: free-text location and verified flag
    loc = d['user'].get('location', u'')
    count_verified += d['user']['verified']
    if loc != u'':
        count_user_locs += 1
        user_locs.update([loc])
    language_ids[d['lang']] += 1
print(count_quoted, has_coordinates, count_replies, count_user_locs, count_verified)
print("Of {} tweets, {} have coordinates, while {} have user locations, comprising {} unique locations".format(
    len(data), has_coordinates, count_user_locs, len(user_locs)
))
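The location counts above treat the user's location field as raw free text, so variants like "New York, NY" and "new york, ny" are tallied separately. A minimal normalization sketch (the case and whitespace folding here is an illustrative assumption, not part of the original pipeline):

norm_locs = Counter()
for loc, n in user_locs.items():
    # collapse case and surrounding whitespace; real cleanup would need geocoding
    norm_locs[loc.strip().lower()] += n
norm_locs.most_common(10)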
In [12]:
user_locs.most_common(10)
Out[12]:
In [13]:
len(data)
Out[13]:
In [14]:
data[0]['user']
Out[14]:
In [15]:
df = pd.read_csv("URL_CAT_MAPPINGS.txt", sep="\t")
df.head()
Out[15]:
In [16]:
df['URL_EXP_SUCCESS'] = (df.EXPANDED_STATUS < 2)
df.head()
Out[16]:
In [17]:
URL_DICT = dict(zip(df[df.URL_CATS != 'UNK'].URL, df[df.URL_CATS != 'UNK'].URL_CATS))
URL_MAPS = dict(zip(df.URL, df.URL_DOMAIN))
URL_EXP_SUCCESS = dict(zip(df.URL, df.URL_EXP_SUCCESS))
len(URL_DICT), df.shape, len(URL_MAPS), len(URL_EXP_SUCCESS)
Out[17]:
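The same three lookups can also be built directly in pandas without dict(zip(...)); an equivalent sketch over the columns above (with duplicate URLs, both forms keep the last occurrence):

URL_DICT = df.loc[df.URL_CATS != 'UNK'].set_index('URL')['URL_CATS'].to_dict()
URL_MAPS = df.set_index('URL')['URL_DOMAIN'].to_dict()
URL_EXP_SUCCESS = df.set_index('URL')['URL_EXP_SUCCESS'].to_dict()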
In [18]:
df.URL.head().values
Out[18]:
In [19]:
URL_MAPS['http://bit.ly/1SqTn5d']
Out[19]:
In [20]:
found_urls = 0
twitter_urls = 0
total_urls = 0
tid_mapped_urls = []
url_types = defaultdict(int)
for d in data:
    if 'urls' in d['entities']:
        m_entities = d['entities']['urls']
        for m in m_entities:
            total_urls += 1
            m = m['expanded_url']
            m_cats = "UNK"
            if m in URL_DICT:
                found_urls += 1
                m_cats = URL_DICT[m]
            elif m.startswith("https://twitter.com") or m.startswith("http://twitter.com"):
                found_urls += 1
                twitter_urls += 1
                m_cats = "socialmedia|twitter"
            else:
                # URL not in the category mapping; fall back to its domain if expansion succeeded
                m_type = "failed_url"
                if URL_EXP_SUCCESS.get(m, False):
                    m_type = URL_MAPS.get(m, "None.com")
                """
                m_type = m.split("/", 3)[2]
                #m_type = m_type.split("/", 3)[2]
                if m_type.startswith("www."):
                    m_type = m_type[4:]
                """
                url_types[m_type] += 1
            tid_mapped_urls.append((d["id"], m, m_cats))
print("Data: %s, Total: %s, Found: %s, Twitter: %s" % (len(data), total_urls, found_urls, twitter_urls))
url_types = Counter(url_types)
url_types.most_common(10)
Out[20]:
In [21]:
url_types.most_common(50)
Out[21]:
In [22]:
sum(url_types.values())
Out[22]:
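Given the branch structure above, url_types is only updated for URLs that were neither in URL_DICT nor twitter.com links, so its total should equal the unmatched remainder; a quick consistency check:

assert sum(url_types.values()) == total_urls - found_urls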
In [23]:
tid_mapped_urls[:10]
Out[23]:
In [24]:
df_mapped_cats = pd.DataFrame(tid_mapped_urls, columns=["TID", "URL", "CATS"])
df_mapped_cats.head()
Out[24]:
In [25]:
df_mapped_cats.to_csv("TID_URL_CATS.txt", sep="\t", index=False)
! head TID_URL_CATS.txt
In [26]:
def extract_meta_features(x):
    """Flatten one tweet dict into a tuple of tweet-level and user-level features."""
    u_data = x["user"]
    u_url = u_data['url']
    if u_url is not None:
        u_url = u_data['entities']['url']['urls'][0]['expanded_url']
    return (x["id"],
            x['created_at'],
            x['retweet_count'],
            x['favorite_count'],
            x['in_reply_to_status_id'] is not None,
            'quoted_status' in x and x['quoted_status'] is not None,
            len(x['entities']['hashtags']),
            len(x['entities']['urls']),
            len(x['entities']['user_mentions']),
            0 if 'media' not in x['entities'] else len(x['entities']['media']),  # Has photos
            u_data['id'],
            u_data[u'created_at'],
            u_data[u'listed_count'],
            u_data[u'favourites_count'],
            u_data[u'followers_count'],
            u_data[u'friends_count'],
            u_data[u'statuses_count'],
            u_data[u'verified'],
            u_data[u'location'].replace('\r', ''),  # strip carriage returns so the TSV export stays one row per tweet
            u_data[u'name'].replace('\r', ''),
            u_url
            )
In [27]:
extract_meta_features(data[0])
Out[27]:
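extract_meta_features indexes most keys directly, so a tweet missing any of them (e.g., a truncated or deleted record) raises KeyError. A defensive variant of the same idea, sketched for a few fields only (the helper name and default values are assumptions, not Twitter-documented behavior):

def safe_meta(x):
    # hypothetical defensive helper; defaults are illustrative assumptions
    u = x.get("user", {})
    ents = x.get("entities", {})
    return (x.get("id"),
            x.get("retweet_count", 0),
            len(ents.get("hashtags", [])),
            len(ents.get("media", [])),
            u.get("verified", False))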
In [28]:
df_meta = pd.DataFrame((extract_meta_features(d) for d in data),
                       columns=["t_id", "t_created", "t_retweets",
                                "t_favorites", "t_is_reply", "t_is_quote",
                                "t_n_hashtags", "t_n_urls", "t_n_mentions",
                                "t_n_media",
                                "u_id", "u_created",
                                "u_n_listed", "u_n_favorites", "u_n_followers",
                                "u_n_friends", "u_n_statuses",
                                "u_is_verified", "u_location", "u_name", "u_url"
                                ])
df_meta.head()
Out[28]:
In [29]:
df_meta.dtypes
Out[29]:
In [30]:
df_meta[df_meta.u_url.apply(lambda x: x is not None)]["u_url"].head()
Out[30]:
In [31]:
df_meta.to_csv("TID_META.txt", sep="\t", index=False, encoding='utf-8')
! head TID_META.txt
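A quick way to verify the export round-trips; the dtype pin for the ID columns is a precaution we add (64-bit tweet IDs can otherwise be parsed as floats if any values are missing), not something the export itself requires:

df_back = pd.read_csv("TID_META.txt", sep="\t", encoding="utf-8",
                      dtype={"t_id": "int64", "u_id": "int64"})
df_back.shape == df_meta.shape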
In [32]:
df_meta[df_meta.u_url.apply(lambda x: x is not None)]["u_url"].shape
Out[32]:
In [33]:
df_meta.shape
Out[33]: